package com.kdtech.analyse.NewsPaper;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;

import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * URL classifier and article parser for cnhubei.com news pages and the
 * digital-newspaper sub-sites hbrb, ctdsb, ctjb, ncxb and sxwb.
 *
 * <p>URLs are classified by matching the <em>whole</em> URL against fixed
 * regular expressions. The expressions are compiled once when the class is
 * loaded instead of on every call.
 */
public class CnhubeiNewsPaperAnalyse implements AnalyseNews {

	/**
	 * Whole-URL patterns for article (detail) pages. Newspaper articles carry a
	 * 3 to 10 digit number after the paper code, which keeps them disjoint from
	 * the 1 to 2 digit page numbers in {@link #TASK_PAGE_PATTERNS}.
	 *
	 * <p>Examples:
	 * {@code http://news.cnhubei.com/xw/gn/201611/t3733832.shtml},
	 * {@code http://sxwb.cnhubei.com/html/sxwb/20161101/sxwb2996927.html}
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = compileAll(
			"http://news\\.cnhubei\\.com/xw/gn/[0-9]*/t[0-9]*\\.shtml",
			"http://hbrb\\.cnhubei\\.com/(HTML|html)/hbrb/[0-9]*/hbrb[0-9]{3,10}\\.html",
			"http://ctdsb\\.cnhubei\\.com/(HTML|html)/ctdsb/[0-9]*/ctdsb[0-9]{3,10}\\.html",
			"http://ctjb\\.cnhubei\\.com/(HTML|html)/ctjb/[0-9]*/ctjb[0-9]{3,10}\\.html",
			"http://ncxb\\.cnhubei\\.com/(HTML|html)/ncxb/[0-9]*/ncxb[0-9]{3,10}\\.html",
			"http://sxwb\\.cnhubei\\.com/(HTML|html)/sxwb/[0-9]*/sxwb[0-9]{3,10}\\.html");

	/**
	 * Whole-URL patterns for task pages: the per-date directory of an issue
	 * (eight-digit date segment) and, for some papers, its numbered pages.
	 *
	 * <p>Example:
	 * {@code http://ctdsb.cnhubei.com/HTML/ctdsb/20121203/ctdsb22.html}
	 */
	private static final Pattern[] TASK_PAGE_PATTERNS = compileAll(
			"http://hbrb\\.cnhubei\\.com/(HTML|html)/hbrb/[0-9]{8}/",
			// The ctjb section used to be listed only under the hbrb host, which
			// disagrees with the ctjb detail pattern. Both hosts are accepted so
			// that no previously matching URL is lost.
			"http://(hbrb|ctjb)\\.cnhubei\\.com/(HTML|html)/ctjb/[0-9]{8}/",
			"http://ctdsb\\.cnhubei\\.com/(HTML|html)/ctdsb/[0-9]{8}/",
			"http://ctdsb\\.cnhubei\\.com/(HTML|html)/ctdsb/[0-9]{8}/ctdsb[0-9]{1,2}\\.html",
			"http://ncxb\\.cnhubei\\.com/(HTML|html)/ncxb/[0-9]{8}/ncxb[0-9]{1,2}\\.html",
			// Optional "index.html" must be a group; a [...] character class
			// would match only a single character.
			"http://ncxb\\.cnhubei\\.com/(HTML|html)/ncxb/[0-9]{8}/(index\\.html)?",
			"http://sxwb\\.cnhubei\\.com/(HTML|html)/sxwb/[0-9]{8}/sxwb[0-9]{1,2}\\.html",
			"http://sxwb\\.cnhubei\\.com/(HTML|html)/sxwb/[0-9]{8}/(index\\.html|sxwb封二\\.html)?");

	/** Compiles each regular expression, preserving order. */
	private static Pattern[] compileAll(String... regexes) {
		Pattern[] compiled = new Pattern[regexes.length];
		for (int i = 0; i < regexes.length; i++) {
			compiled[i] = Pattern.compile(regexes[i]);
		}
		return compiled;
	}

	/**
	 * Tells whether {@code url} matches at least one of {@code patterns} in
	 * its entirety. A {@code null} URL matches nothing.
	 */
	private static boolean matchesAny(Pattern[] patterns, String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : patterns) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Checks whether the URL points at an article page of a supported site.
	 *
	 * @param url absolute URL to classify; may be {@code null}
	 * @return {@code true} if the whole URL matches one of the detail-page
	 *         patterns, {@code false} otherwise (including for {@code null})
	 */
	public boolean isDetailPage(String url) {
		return matchesAny(DETAIL_PAGE_PATTERNS, url);
	}

	/**
	 * Builds a {@link NewsMeta} from a fetched page.
	 *
	 * <p>The title is taken from the document's {@code <title>} element and cut
	 * with {@code StringUtils.substringBefore(title, "_")} (presumably dropping
	 * a site-name suffix after the first underscore; confirm against real
	 * pages). The content comes from the {@code div#copytext} element via
	 * {@code HtmlCleaner.getContentHtml}, and the date is whatever
	 * {@code DateUtils.matchDate} derives from the URL.
	 *
	 * @param urlMeta carrier of the page URL and its raw HTML; neither the
	 *                object nor its HTML is checked for {@code null} here
	 * @return a new {@link NewsMeta} with URL, title, content and date set
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();
		Document doc = Jsoup.parse(html);

		String rawTitle = doc.select("title").text();
		Long date = DateUtils.matchDate(url);
		String content = HtmlCleaner.getContentHtml(url, doc.select("div#copytext"));
		String title = StringUtils.substringBefore(rawTitle, "_");

		NewsMeta newspaper = new NewsMeta();
		newspaper.setUrl(url);
		newspaper.setTitle(StringUtils.trimSpace(title));
		newspaper.setContent(StringUtils.trimSpace(content));
		newspaper.setDate(date);
		return newspaper;
	}

	/**
	 * Checks whether the URL points at a task page (an issue's date directory
	 * or one of its numbered pages) of a supported newspaper.
	 *
	 * @param url absolute URL to classify; may be {@code null}
	 * @return {@code true} if the whole URL matches one of the task-page
	 *         patterns, {@code false} otherwise (including for {@code null})
	 */
	public boolean isTaskPage(String url) {
		return matchesAny(TASK_PAGE_PATTERNS, url);
	}

}
